########################################################
## This code carries out parameter tuning, model training and prediction
## for the model that excludes individuals who entered into training programs.
########################################################

########################################################
## Preparation of the workspace
########################################################

## start from a clean slate: delete every object in the current environment
rm(list = ls())

## load the required packages
library(haven)
library(caret)
library(randomForest)
library(doParallel)
library(mice)
library(plyr)
library(dplyr)
library(VIM)
library(base)
library(ranger)
library(glmnet)
library(xgboost)

## remember when the script started, so the run time can be checked
start_time <- Sys.time()

## project root and the two sub-folders used throughout the script
main <- "placeholder_main"

dataDirectory <- paste0(main, "/Data")
RDirectory <- paste0(main, "/Programs/Output Generation")

## Source the helper scripts, tuning first and predictions second.
## tuning.int() and training.int(), called further down, are presumably
## defined in these files -- confirm.
invisible(lapply(
  c("/102_0_caret_parameter_tuning_Function.R",
    "/103_0_caret_predictions_Function.R"),
  function(helper_script) source(paste0(RDirectory, helper_script))
))



########################################################
## Define the inputs (a single outcome variable and the seed) and prepare the data
########################################################
 
# Outcome variable to be predicted and the seed passed to tuning and training
dependent <- c("emplAft6M_0M_In")
seed <- 2111

## Load the data set:
data <- read_dta(paste0(dataDirectory, "/002_DataForR_Full_NoTraining_2006.dta"))

## Data cleaning:
y <- dependent

# Fail early, with a readable message, if the outcome column is absent
if (!(y %in% colnames(data))) {
  stop("Outcome variable '", y, "' not found in the data set.", call. = FALSE)
}

# Correcting format of outcome variable:
# the observed values are sorted by factor(), so the smaller value becomes
# "no" and the larger one "yes".
data[[y]] <- factor(data[[y]])

# Relabelling is only meaningful with exactly two observed values. With a
# single observed value `levels<-` would silently call it "no", whatever it
# is, so stop instead of mislabelling.
if (nlevels(data[[y]]) != 2L) {
  stop("Outcome variable '", y, "' must have exactly two observed values, ",
       "found ", nlevels(data[[y]]), ".", call. = FALSE)
}
levels(data[[y]]) <- c("no", "yes")

# Keeping only those observations with non-missing outcome variables
data <- data[!is.na(data[[y]]), ]

## Select the model columns by position: the outcome plus every column from
## "Gender" through the last column of the data set.
## NOTE(review): any identifier or sample column located after "Gender" would
## also be handed to the models as a predictor -- confirm the column order.
first_column <- which(colnames(data) == "Gender") ## Column number where the covariates start
y_column <- which(colnames(data) == dependent)    ## Column number of the outcome variable

# Both anchors must be found exactly once, otherwise the positional selection
# below is meaningless (and `:` fails with an obscure message).
if (length(first_column) != 1L || length(y_column) != 1L) {
  stop("Expected exactly one 'Gender' column and one '", dependent,
       "' column in the data set.", call. = FALSE)
}

# setdiff() prevents the outcome from being selected a second time (and thus
# leaking into the predictors) should it sit at or after "Gender".
data.in <- data[, c(y_column, setdiff(first_column:ncol(data), y_column))]

## Split by sample indicator. Tuning (samp 1) and training (samp 2) use only
## individuals without training; prediction uses samples 3-5 in full.
## which() drops rows where `samp` or `training_combined_6months` is missing;
## a bare logical mask would turn each such row into an all-NA observation.
data_tune <- data.in[which(data$samp == 1 & data$training_combined_6months == 0), ]
data_train <- data.in[which(data$samp == 2 & data$training_combined_6months == 0), ]
data_pred <- data.in[data$samp %in% c(3, 4, 5), ]

## Identifiers kept alongside the predictions (same rows as data_pred)
persinfo <- data[data$samp %in% c(3, 4, 5), c("LopNr_PersonNr", "InLnr", "n", "samp", "training_combined_6months")]


## Tune the hyperparameters on the tuning sample
cat("--> Tuning")
parameters <- tuning.int(
  data = data_tune,
  dependent = y,
  seed = seed,
  noisily = TRUE
)

## Export the final grid of each learner (random forest, boosting, lasso) as
## CSV without row names.
## NOTE(review): only the random-forest file name contains "Full", and no
## separator precedes the outcome name; left unchanged because other scripts
## may read these exact paths -- confirm before renaming.
invisible(Map(
  function(grid, stem) {
    write.csv(grid,
              file = paste0(dataDirectory, stem, dependent, "_2006.csv"),
              row.names = FALSE)
  },
  list(parameters$rfgrid_final,
       parameters$boostgrid_final,
       parameters$lassogrid_final),
  c("/102_rfgrid_Full_NoTraining",
    "/102_boostgrid_NoTraining",
    "/102_lassogrid_NoTraining")
))


## Fit the three algorithms on the training sample with the tuned parameters
cat("--> Training")
models <- training.int(
  data = data_train,
  dependent = y,
  parameters = parameters,
  seed = seed,
  noisily = TRUE
)


## Store the fitted models on disk (object name inside the file: "models")
save(list = "models",
     file = paste0(dataDirectory, "/103_Models_NoTraining", dependent, "_2006.rda"))


## Last step: score the prediction sample with each fitted model
cat("--> Predicting")

# Separate the covariates from the observed outcome:
total_x <- data_pred[, setdiff(names(data_pred), dependent)]
total_y <- data_pred[[dependent]]

## Random forest: keep the second column of the `predictions` element
## (presumably the probability of the second outcome level, "yes" -- confirm)
pred_rf <- predict(models$rff_final, total_x)[["predictions"]]
pred_rf <- data.frame(pred_rf[, 2])

## Gradient boosting: covariates are passed as a numeric matrix
pred_boost <- predict(models$rboost_final, data.matrix(total_x))

## LASSO: covariates as a numeric matrix, predictions on the response scale
pred_lasso <- predict(models$rlasso_final, data.matrix(total_x), type = "response")

# Put the observed outcome next to the three model predictions:
predictions <- data.frame(total_y, pred_rf, pred_boost, pred_lasso)

# Rename columns: the outcome keeps its name, the predictions become
# p_<outcome>_rf, p_<outcome>_boost and p_<outcome>_lasso.
colnames(predictions) <- c(y, paste0("p_", y, c("_rf", "_boost", "_lasso")))

# Attach the personal identifiers (rows come from the same prediction sample)
output <- cbind(persinfo, predictions)

## Save predictions
## NOTE(review): unlike the grid files, this CSV is written with row names
## (an unnamed first column); kept as is in case downstream readers expect
## that layout -- confirm.
write.csv(output,
          file = paste0(dataDirectory, "/103_predictionsR_Full_NoTraining_", dependent, "_2006.csv"))

## Report how long the run took. start_time is recorded at the top of the
## script for exactly this purpose but was never read.
if (exists("start_time")) {
  cat("\n--> Done. Elapsed time:", format(Sys.time() - start_time), "\n")
}

